First, we'll import some basic libraries and load our CSV dataset.
import numpy as np
import pandas as pd
import os

# NOTE(review): hard-coded absolute Windows path — runs only on this machine.
os.chdir("C:\\Users\\oshapira\\Desktop\\Analytics\\Uber\\data")
##import dataset and create data frame
df_raw = pd.read_csv('rideshare_kaggle.csv')  # Kaggle Uber/Lyft rideshare dataset
df = df_raw.copy()  # work on a copy so the raw load stays untouched
Next, we'll filter on the column features that are of interest for our analysis. We will also drop any 'NA' values, although an alternative step would be to impute these values instead.
###initial filtering of columns desired for analysis
# Keep ride metadata, price/distance/surge, and the weather features.
columns_to_keep = ['timestamp', 'hour', 'datetime', 'source', 'destination', 'cab_type', 'name', 'price', 'distance',
                   'surge', 'temperature','precipIntensity', 'humidity', 'windSpeed']
df = df.filter(items = columns_to_keep)
df = df.dropna() #drop NAs (an alternative would be to impute these values)
Some additional features I wanted to evaluate for this regression analysis included time of day, and day of week - neither of which were included in this dataset. These columns were derived by doing the following:
##create new datetime column that is in date_time format
df['datetime_2'] = pd.to_datetime(df.datetime, format="%Y-%m-%d %H:%M:%S") ##convert datetime to datetime format

##add variable for day of week for given ride
##sunday = 6, monday = 0  (Python weekday() convention)
dict_weekday = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
                4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
# Vectorized .dt accessor replaces the original per-row list comprehension;
# the dict covers all 7 weekday codes, so no fillna fallback is needed.
df['weekday'] = df['datetime_2'].dt.weekday.map(dict_weekday)

from datetime import datetime  # retained for compatibility; no longer needed after vectorization

####extract precise hour and minute of day from datetime variable
# Encode time-of-day as an integer HHMMSS (e.g. 09:30:15 -> 93015), using the
# already-parsed datetime_2 instead of re-parsing the raw strings row by row.
df['time'] = df['datetime_2'].dt.strftime("%H%M%S").astype(int)
import matplotlib.pyplot as plt
%matplotlib qt
import seaborn as sns

# Side-by-side bar charts: ride counts per product for Uber vs. Lyft.
h, axes = plt.subplots (1,2, figsize=(12,4))
Ux=df.name[df.cab_type=='Uber'].unique()  # Uber product names, in order of first appearance
Lx=df.name[df.cab_type=='Lyft'].unique()  # Lyft product names
# Counts are assembled manually in a hard-coded product order.
# NOTE(review): this order must match what unique() returns above, otherwise
# bars are paired with the wrong labels — verify against the dataset.
Uy = df.name[df.name=='UberXL'].count(),df.name[df.name=='Black'].count(),\
df.name[df.name=='UberX'].count(),df.name[df.name=='WAV'].count(),\
df.name[df.name=='Black SUV'].count(),df.name[df.name=='UberPool'].count()
Ly=df.name[df.name=='Shared'].count(),df.name[df.name=='Lux'].count(),\
df.name[df.name=='Lyft'].count(),df.name[df.name=='Lux Black XL'].count(),\
df.name[df.name=='Lyft XL'].count(),df.name[df.name=='Lux Black'].count()
vis1= sns.barplot(Ux,np.array(Uy),palette='Accent',ax=axes[0])
vis2= sns.barplot(Lx,np.array(Ly),palette='Accent',ax=axes[1])
axes[0].set_title('Number of Uber Rides')
axes[1].set_title('Number of Lyft Rides')
plt.ioff()  # keep interactive mode off so figures render on demand
I decided to map the ride types in this data set to 'Shared', 'Luxury', and 'Standard'. I figured this would simplify my analysis while grouping together ride types that should have similar prices.
###create column for ride types so that Uber and Lyft can be more comparable
types=list(df.name.unique())  # NOTE(review): unused — candidate for removal
# Collapse each service's product names into comparable tiers
# (Shared / Standard / XL / Luxury).
types_map = {'Shared':'Shared', 'Lux':'Luxury', 'Lyft':'Standard',
             'Lux Black XL': 'Luxury', 'Lyft XL':'XL', 'Lux Black':'Luxury', 'UberXL':'XL',
             'Black':'Luxury', 'UberX':'Standard', 'WAV':'Luxury', 'Black SUV':'Luxury', 'UberPool':'Shared',
             'Taxi':'Luxury'}
df['ride_type'] = df['name'].map(types_map).fillna(df['name'])  # unmapped names pass through unchanged
df_sample = df.head(n = 100)  # small sample for quick manual inspection
I will start by building a class, which includes a function for filtering data based on my selected features of interest.
from sklearn.model_selection import train_test_split
from scipy import stats
import statsmodels.api as sm
from scipy.stats import f as fisher_f
from sklearn import metrics
from scipy import stats
from sklearn.linear_model import LinearRegression
import numpy as np
class MyLinearRegression:
    """Filter the global ride-share DataFrame ``df`` for a given feature
    list, cab type(s), and ride type(s)."""

    def __init__(self, features, cab_type, ride_type):  # build instance
        self.features = features    # column names to keep; should include 'price'
        self.cab_type = cab_type    # list of cab types, e.g. ['Uber', 'Lyft']
        self.ride_type = ride_type  # list of ride tiers, e.g. ['Standard']

    def filter_data(self):
        """Return a filtered copy of the global ``df`` restricted to the
        requested cab/ride types and feature columns, with the categorical
        'weekday' column one-hot encoded (first level dropped)."""
        df_filter = df.loc[(df['ride_type'].isin(self.ride_type)) & (df['cab_type'].isin(self.cab_type))]
        df_filter = df_filter.filter(items=self.features)
        # Exact column check: the original substring scan
        # (any('weekday' in s ...)) would also match e.g. 'weekday_x'.
        if 'weekday' in df_filter.columns:
            dummies = pd.get_dummies(df_filter['weekday'], prefix='Day', drop_first=True)
            df_filter = pd.concat([df_filter, dummies], axis=1)
            df_filter.drop(['weekday'], axis=1, inplace=True)
        # Single reset_index replaces the duplicated if/else branches.
        return df_filter.reset_index(drop=True)
Next, I wanted to get a feel for the correlation between the features of interest I had selected. So I created a scatterplot matrix.
from pandas.plotting import scatter_matrix

def scatter_matrix_plot(variables, cab_type, ride_type):
    """Show a scatter-plot matrix (KDE on the diagonal) of the selected
    features, filtered through MyLinearRegression.filter_data()."""
    filtered_df = MyLinearRegression(variables, cab_type, ride_type)
    filtered_df = filtered_df.filter_data()
    scatter_matrix(filtered_df, alpha=0.1, figsize=(15,15), diagonal='kde')
    return plt.show()

scatter_matrix_plot(['price','distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time'], ['Uber'], ['Shared'])
Since there are many functions I would have to repeatedly call for my model testing, I built a class to help keep my code organized.
from sklearn.model_selection import train_test_split
from scipy import stats
import statsmodels.api as sm
from scipy.stats import f as fisher_f
from sklearn import metrics
from scipy import stats
from sklearn.linear_model import LinearRegression
import numpy as np
class MyLinearRegression:
    """Linear-regression workflow over the global ride-share DataFrame ``df``.

    Filters rows by cab/ride type and feature list, makes a deterministic
    80/20 train/test split ('price' is the target), fits sklearn's
    LinearRegression, and reports r2 / MAE / MSE / RMSE on either split.
    The fixed random_state makes every split identical, so the metric
    methods may safely rebuild the model independently.
    """

    def __init__(self, features, cab_type, ride_type):  # build instance
        self.features = features    # columns to keep; should include 'price'
        self.cab_type = cab_type    # e.g. ['Uber', 'Lyft']
        self.ride_type = ride_type  # e.g. ['Standard']

    def filter_data(self):
        """Return ``df`` filtered to the requested cab/ride types and
        features, with 'weekday' one-hot encoded when present."""
        df_filter = df.loc[(df['ride_type'].isin(self.ride_type)) & (df['cab_type'].isin(self.cab_type))]
        df_filter = df_filter.filter(items=self.features)
        # Exact column test; the original substring scan would also match
        # derived columns such as 'weekday_x'.
        if 'weekday' in df_filter.columns:
            dummies = pd.get_dummies(df_filter['weekday'], prefix='Day', drop_first=True)
            df_filter = pd.concat([df_filter, dummies], axis=1)
            df_filter.drop(['weekday'], axis=1, inplace=True)
        return df_filter.reset_index(drop=True)

    def split_data(self):
        """Deterministic 80/20 train/test split (random_state=0)."""
        df_filter = self.filter_data()
        X = df_filter.loc[:, df_filter.columns != 'price']
        Y = df_filter['price']
        return train_test_split(X, Y, test_size=0.2, random_state=0)

    def create_model(self):
        """Fit LinearRegression on the training split and return it."""
        X_train, _X_test, y_train, _y_test = self.split_data()
        model_cab = LinearRegression()
        model_cab.fit(X_train, y_train)
        return model_cab

    def predict_model(self):
        """Predictions for the held-out test split.  (create_model()
        already fits; the original refit redundantly here.)"""
        _X_train, X_test, _y_train, _y_test = self.split_data()
        return self.create_model().predict(X_test)

    def linear_formula(self):
        """Print y = mx + b using the first coefficient only — meaningful
        for the single-feature (price ~ distance) model."""
        model = self.create_model()
        slope = model.coef_[0]
        intercept = model.intercept_
        print("The linear regression line for " + "is y = " + str(round(slope,2)) + "x + " + str(round(intercept,2)))

    # ---- metrics on the held-out test split ----

    def metric_r2(self):
        """R^2 on the test split, rounded to 3 decimals."""
        _X_train, X_test, _y_train, y_test = self.split_data()
        return round(self.create_model().score(X_test, y_test), 3)

    def metric_MAE(self):
        """Mean absolute error on the test split."""
        _X_train, _X_test, _y_train, y_test = self.split_data()
        return round(metrics.mean_absolute_error(self.predict_model(), y_test), 3)

    def metric_MSE(self):
        """Mean squared error on the test split."""
        _X_train, _X_test, _y_train, y_test = self.split_data()
        return round(metrics.mean_squared_error(self.predict_model(), y_test), 3)

    def metric_RMSE(self):
        """Root mean squared error on the test split."""
        _X_train, _X_test, _y_train, y_test = self.split_data()
        return round(np.sqrt(metrics.mean_squared_error(self.predict_model(), y_test)), 3)

    # ---- the same metrics on the TRAINING split (overfitting check) ----

    def predict_model_train(self):
        """Predictions for the training split."""
        X_train, _X_test, _y_train, _y_test = self.split_data()
        return self.create_model().predict(X_train)

    def metric_r2_train(self):
        """R^2 on the training split."""
        X_train, _X_test, y_train, _y_test = self.split_data()
        return round(self.create_model().score(X_train, y_train), 3)

    def metric_MAE_train(self):
        """Mean absolute error on the training split."""
        _X_train, _X_test, y_train, _y_test = self.split_data()
        return round(metrics.mean_absolute_error(self.predict_model_train(), y_train), 3)

    def metric_MSE_train(self):
        """Mean squared error on the training split."""
        _X_train, _X_test, y_train, _y_test = self.split_data()
        return round(metrics.mean_squared_error(self.predict_model_train(), y_train), 3)

    def metric_RMSE_train(self):
        """Root mean squared error on the training split."""
        _X_train, _X_test, y_train, _y_test = self.split_data()
        return round(np.sqrt(metrics.mean_squared_error(self.predict_model_train(), y_train)), 3)
Since I suspect that several features will have an impact on Uber/Lyft prices, I wanted to build a multiple regression model. When doing so, I wanted to ensure that the variables included in this model have a statistically significant impact on price. Therefore, I built a function that inputs all features, removes any features that have a p-value higher than my alpha (in this case 0.05), and keeps repeating this process until none of the features left have a p-value > alpha. My final output will be a summary of the regression model and the variables left, using this "backwards elimination" technique.
def linear_params_test(variables, cab_type, ride_type, sigvalue):
    """Backward elimination on an OLS model of price vs. `variables`.

    Repeatedly drops the single feature with the highest p-value until all
    remaining p-values are <= sigvalue, then returns the fitted
    statsmodels OLS results object.
    """
    returned_df = MyLinearRegression(variables, cab_type, ride_type)
    df_filter = returned_df.filter_data()
    X = df_filter.loc[:, df_filter.columns != 'price']
    Y = df_filter['price']
    X = sm.add_constant(X)  ##statsmodels library does not add a constant by default
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    lrmodel = sm.OLS(y_train, X_train).fit()
    # Drop exactly ONE feature (the current worst p-value) per iteration,
    # then refit and recheck.  The original nested for-loop kept dropping
    # the running maximum for a fixed number of passes, which could remove
    # features already below the significance threshold.
    while lrmodel.pvalues.max() > sigvalue and len(X_train.columns) > 1:
        worst = lrmodel.pvalues.idxmax()
        X_train = X_train.drop(columns=[worst])
        X_test = X_test.drop(columns=[worst])
        lrmodel = sm.OLS(y_train, X_train).fit()
    return lrmodel
def linear_params_summary(variables, cab_type, ride_type, sigvalue):
    """Run backward elimination and return the statsmodels summary table."""
    fitted = linear_params_test(variables, cab_type, ride_type, sigvalue)
    return fitted.summary()
#jupyter nbextension enable --py widgetsnbextension
from __future__ import print_function  # notebook-cell import; harmless on Python 3
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display

output = widgets.Output()        # echoes the current widget selections
other_output = widgets.Output()  # holds the OLS summary table

# Multi-select widgets for cab type and ride tier, plus an alpha slider.
cab_type = widgets.SelectMultiple(
    options=['Uber', 'Lyft'],
    value=['Uber'],
    description='Cab Type',
    disabled=False
)
ride_type = widgets.SelectMultiple(
    options=['Shared', 'Luxury', 'Standard'],
    value=['Standard'],
    description='Ride Type',
    disabled=False
)
sigvalue = widgets.FloatSlider(
    min = 0,
    max = 1,
    step = .05,
    description = 'Significance Value Alpha:',
    value = 0.05
)

def linear_backwards_elim(cab_type, ride_type, sigvalue):
    """Re-run backward elimination with the current widget values and
    render the selections plus the resulting model summary."""
    output.clear_output()
    other_output.clear_output()
    cab_type = list(cab_type)    # SelectMultiple yields tuples; downstream expects lists
    ride_type = list(ride_type)
    with output:
        display(cab_type, ride_type, sigvalue)
    with other_output:
        # The candidate feature set is fixed; only filters and alpha vary.
        variables = ['price','distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time']
        lrmodel = linear_params_test(variables, cab_type, ride_type, sigvalue)
        summary = lrmodel.summary()
        display(summary)

# Observer callbacks: each widget change re-runs the elimination with the
# changed value plus the other widgets' current values.
def cab_type_event(change):
    linear_backwards_elim(change.new, ride_type.value, sigvalue.value)

def ride_type_event(change):
    linear_backwards_elim(cab_type.value, change.new, sigvalue.value)

def sig_value_event(change):
    linear_backwards_elim(cab_type.value, ride_type.value, change.new)

cab_type.observe(cab_type_event, names = 'value')
ride_type.observe(ride_type_event, names = 'value')
sigvalue.observe(sig_value_event, names = 'value')

display(cab_type)
display(ride_type)
display(sigvalue)
display(output)
display(other_output)
Next, I tested out polynomial regressions.
############different degrees with regression
import statsmodels.formula.api as smf

def reg_params_poly(variables, cab_type, ride_type, degree):
    """Fit a numpy polynomial of `degree` for price vs. distance on the
    training split and return the np.poly1d model."""
    returned_df = MyLinearRegression(variables, cab_type, ride_type)
    X_train, _X_test, y_train, _y_test = returned_df.split_data()
    # The original also built an unused `combined_df` (with a typo'd
    # 'X_tran' column) — dead code, removed.
    weights = np.polyfit(X_train['distance'].values, y_train.values, degree)
    return np.poly1d(weights)
def reg_poly_predict(variables, cab_type, ride_type, degree):
    """Evaluate the fitted degree-`degree` polynomial on the test-split
    distances and return the predictions."""
    # Only the test-split X is needed; the original unpacked all four
    # arrays and left three unused.
    returned_df = MyLinearRegression(variables, cab_type, ride_type)
    X_test = returned_df.split_data()[1]['distance'].values
    model = reg_params_poly(variables, cab_type, ride_type, degree)
    return model(X_test)
def reg_poly_r2(variables, cab_type, ride_type, degree):
    """R^2 of the degree-`degree` polynomial on the test split,
    rounded to 3 decimals."""
    # Only y_test is needed; the original's extra unpacking was dead code.
    y_test = MyLinearRegression(variables, cab_type, ride_type).split_data()[3].values
    predict = reg_poly_predict(variables, cab_type, ride_type, degree)
    return round(metrics.r2_score(y_test, predict), 3)
def reg_poly_MAE(variables, cab_type, ride_type, degree):
    """Mean absolute error of the polynomial fit on the test split."""
    # Only y_test is needed; unused unpacking removed.
    y_test = MyLinearRegression(variables, cab_type, ride_type).split_data()[3].values
    predict = reg_poly_predict(variables, cab_type, ride_type, degree)
    return round(metrics.mean_absolute_error(y_test, predict), 3)
def reg_poly_MSE(variables, cab_type, ride_type, degree):
    """Mean squared error of the polynomial fit on the test split."""
    # Only y_test is needed; unused unpacking removed.
    y_test = MyLinearRegression(variables, cab_type, ride_type).split_data()[3].values
    predict = reg_poly_predict(variables, cab_type, ride_type, degree)
    return round(metrics.mean_squared_error(y_test, predict), 3)
def reg_poly_RMSE(variables, cab_type, ride_type, degree):
    """Root mean squared error of the polynomial fit on the test split."""
    # Only y_test is needed; unused unpacking removed.
    y_test = MyLinearRegression(variables, cab_type, ride_type).split_data()[3].values
    predict = reg_poly_predict(variables, cab_type, ride_type, degree)
    return round(np.sqrt(metrics.mean_squared_error(y_test, predict)), 3)
def reg_poly_predict_train(variables, cab_type, ride_type, degree):
    """Evaluate the fitted polynomial on the TRAINING-split distances
    (used to gauge overfitting)."""
    # Only the training-split X is needed; unused unpacking removed.
    X_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[0]['distance'].values
    model = reg_params_poly(variables, cab_type, ride_type, degree)
    return model(X_train)
def reg_poly_r2_train(variables, cab_type, ride_type, degree):
    """R^2 of the polynomial fit on the training split."""
    # Only y_train is needed; unused unpacking removed.
    y_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[2].values
    predict = reg_poly_predict_train(variables, cab_type, ride_type, degree)
    return round(metrics.r2_score(y_train, predict), 3)
def reg_poly_MAE_train(variables, cab_type, ride_type, degree):
    """Mean absolute error of the polynomial fit on the training split."""
    # Only y_train is needed; unused unpacking removed.
    y_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[2].values
    predict = reg_poly_predict_train(variables, cab_type, ride_type, degree)
    return round(metrics.mean_absolute_error(y_train, predict), 3)
def reg_poly_MSE_train(variables, cab_type, ride_type, degree):
    """Mean squared error of the polynomial fit on the training split."""
    # Only y_train is needed; unused unpacking removed.
    y_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[2].values
    predict = reg_poly_predict_train(variables, cab_type, ride_type, degree)
    return round(metrics.mean_squared_error(y_train, predict), 3)

reg_poly_MSE(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],3)
def reg_poly_RMSE_train(variables, cab_type, ride_type, degree):
    """Root mean squared error of the polynomial fit on the training split."""
    # Only y_train is needed; unused unpacking removed.
    y_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[2].values
    predict = reg_poly_predict_train(variables, cab_type, ride_type, degree)
    return round(np.sqrt(metrics.mean_squared_error(y_train, predict)), 3)
Then, I performed the Chow test.
###########chow test
from scipy.stats import f as fisher_f

def chow_test(alpha_val, ride_type):
    """Chow test: do Uber and Lyft need separate price~distance regressions?

    Compares the pooled test-split SSE against the sum of the per-company
    SSEs and evaluates the resulting F statistic at significance alpha_val.
    Returns a human-readable verdict string.

    Fixes vs. the original: (1) the `ride_type` argument is now honoured —
    it was silently hard-coded to ['Standard']; (2) the decision rule used
    the CDF directly against alpha, inverting the test — the p-value is
    the UPPER tail of F(k, N-2k).
    """
    ##predictions and actuals for the pooled Uber + Lyft model
    all_lin_reg_simple = MyLinearRegression(['price', 'distance'], ['Uber', 'Lyft'], ride_type)
    pred_all = all_lin_reg_simple.predict_model()
    actual_all = np.asarray(all_lin_reg_simple.split_data()[3])
    ##Predictions for Uber Only
    uber_lin_reg_simple = MyLinearRegression(['price', 'distance'], ['Uber'], ride_type)
    pred_uber = uber_lin_reg_simple.predict_model()
    actual_uber = np.asarray(uber_lin_reg_simple.split_data()[3])
    ##Predictions for Lyft Only
    lyft_lin_reg_simple = MyLinearRegression(['price', 'distance'], ['Lyft'], ride_type)
    pred_lyft = lyft_lin_reg_simple.predict_model()
    actual_lyft = np.asarray(lyft_lin_reg_simple.split_data()[3])
    ##Add up Sum of Squared Errors
    SSE_all = np.sum((pred_all - actual_all) ** 2)
    SSE_uber = np.sum((pred_uber - actual_uber) ** 2)
    SSE_lyft = np.sum((pred_lyft - actual_lyft) ** 2)
    N = len(pred_all)
    k = 2  # parameters per regression: slope and intercept
    numerator = (SSE_all - (SSE_lyft + SSE_uber)) / k
    denominator = (SSE_lyft + SSE_uber) / (N - 2 * k)  # both regressions use k parameters
    f_statistic = numerator / denominator
    # Upper-tail p-value of the F(k, N-2k) distribution.
    p_value = fisher_f.sf(f_statistic, k, N - 2 * k)
    if p_value < alpha_val:
        return str('The Chow test shows that there is a statistically significant difference between the Uber and Lyft regression lines')
    else:
        return str('The Chow test shows that there is not a statistically significant difference between the Uber and Lyft regression lines')

chow_test(.05,['Standard'])
# jupyter nbextension enable --py widgetsnbextension
from plotly.offline import init_notebook_mode, iplot
import plotly.express as px
from plotly.offline import plot
import plotly.graph_objs as go
def lin_reg_cab_df(cab_type, ride_type):
    """Fit price~distance for one cab company and return a test-split
    DataFrame with columns distance / actual_price / predicted_price."""
    subset = df.filter(items=['price', 'distance', 'ride_type', 'cab_type'])
    subset = subset.loc[subset['ride_type'].isin(ride_type)]
    subset = subset.loc[subset['cab_type'] == cab_type]
    X = subset.loc[:, subset.columns == 'distance']
    Y = subset.loc[:, subset.columns == 'price']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    # Re-index the test split from 0 so the three columns align on concat.
    pieces = [
        pd.DataFrame(X_test['distance'].reset_index(drop=True)),
        pd.DataFrame(y_test['price'].reset_index(drop=True)),
        pd.DataFrame(reg.predict(X_test)),
    ]
    result = pd.concat(pieces, axis=1)
    result.columns = ['distance', 'actual_price', 'predicted_price']
    return result
import plotly.express as px
from plotly.offline import plot
import plotly.graph_objs as go

def plot_lin_reg(ride_type):
    """Overlay Uber and Lyft actual test prices (markers) with each
    company's fitted price~distance regression line (lines)."""
    uber_df = lin_reg_cab_df('Uber', ride_type)
    lyft_df = lin_reg_cab_df('Lyft', ride_type)
    # Actual prices as semi-transparent scatter points.
    trace0 = go.Scatter(
        x = uber_df['distance'],
        y = uber_df['actual_price'],
        mode = 'markers',
        marker = dict(opacity = 0.5),
        name = 'Uber Actual Prices'
    )
    trace1 = go.Scatter(
        x = lyft_df['distance'],
        y = lyft_df['actual_price'],
        mode = 'markers',
        marker = dict(opacity = 0.5),
        name = 'Lyft Actual Prices',
    )
    # Predicted prices rendered as lines (the fitted regressions).
    trace2 = go.Scatter(
        x = uber_df['distance'],
        y = uber_df['predicted_price'],
        mode = 'lines',
        name = 'Uber Regression Line'
    )
    trace3 = go.Scatter(
        x = lyft_df['distance'],
        y = lyft_df['predicted_price'],
        mode = 'lines',
        name = 'Lyft Regression Line'
    )
    data = [trace0, trace1, trace2, trace3]
    return iplot(data)

plot_lin_reg(['Standard'])
Then I plotted the learning curve.
def plot_learning_curves(model, X, y):
    """Plot train vs. validation RMSE as the training-set size grows.

    NOTE(review): the X and y parameters are ignored — the data actually
    comes from a fixed MyLinearRegression(['price','distance'], ['Uber'],
    ['Standard']) split.  Confirm whether this was intended.
    """
    reg_simple_df = MyLinearRegression(['price','distance'], ['Uber'],['Standard'])
    X_train, X_val, y_train, y_val = reg_simple_df.split_data()
    train_errors, val_errors = [], []
    # Refit on the first m samples only, for m = 1..499.
    for m in range(1, 500):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(metrics.mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(metrics.mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize = (15,9))
    # sqrt converts the stored MSEs to RMSE for plotting.
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.title('Learning Curve')
    plt.xlabel('Number of Samples')
    plt.ylabel('RMSE')
    plt.legend(loc = 'upper right')
    return plt.plot()

X = df['distance']
y = df['price']
lin_reg = LinearRegression()
plot_learning_curves(lin_reg, X, y)
def plot_learning_curves_multi(model, X, y):
    """Learning curve for the multi-feature linear model.

    NOTE(review): as with plot_learning_curves, the X and y parameters are
    ignored — data comes from a fixed MyLinearRegression split.  Confirm.
    """
    reg_simple_df = MyLinearRegression(['price','distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time'], ['Uber'],['Standard'])
    X_train, X_val, y_train, y_val = reg_simple_df.split_data()
    train_errors, val_errors = [], []
    # Refit on the first m samples only, for m = 1..499.
    for m in range(1, 500):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(metrics.mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(metrics.mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize = (15,9))
    # sqrt converts stored MSEs to RMSE for plotting.
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.title('Learning Curve')
    plt.xlabel('Number of Samples')
    plt.ylabel('RMSE')
    plt.legend(loc = 'upper right')
    return plt.plot()

X = df[['distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time']]
y = df['price']
lin_reg = LinearRegression()
plot_learning_curves_multi(lin_reg, X, y)
def plot_learning_curves_poly(X, y):
    """Learning curve for the degree-3 price~distance polynomial fit.

    NOTE(review): the X and y parameters are ignored; the fit uses numpy
    arrays from a second split while validation predictions use the
    DataFrame X_val/y_val from the first split — verify this data flow.
    """
    reg_simple_df = MyLinearRegression(['price','distance'], ['Uber'],['Standard'])
    X_train, X_val, y_train, y_val = reg_simple_df.split_data()
    df_filter = reg_simple_df.split_data()
    # Re-extract raw numpy arrays for np.polyfit (overwrites X_train/y_train).
    X_train = df_filter[0]['distance'].values
    X_test = df_filter[1]['distance'].values
    y_train = df_filter[2].values
    y_test = df_filter[3].values
    combined_df = pd.DataFrame(columns = ['y_train', 'x_train'])  # NOTE(review): unused ('X_tran' is a typo)
    combined_df['y_train'] = y_train
    combined_df['X_tran'] = X_train
    train_errors, val_errors = [], []
    for m in range(1, 500):
        # Fit a cubic on the first m samples.
        model = np.polyfit(X_train[:m], y_train[:m],3)
        model = np.poly1d(model)
        y_train_predict = model(X_train[:m])
        y_val_predict = model(X_val)
        train_errors.append(metrics.mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(metrics.mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize = (15,9))
    # sqrt converts stored MSEs to RMSE for plotting.
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.title('Learning Curve')
    plt.xlabel('Number of Samples')
    plt.ylabel('RMSE')
    plt.legend(loc = 'upper right')
    return plt.plot()

X = df['distance']
y = df['price']
degree = 3
plot_learning_curves_poly(X, y)
Then I aggregated the performance metrics of all the linear models I tested.
def performance_metrics(cab_type, ride_type):
    """Collect test- and train-split metrics (r2/MAE/MSE/RMSE) for the four
    model families and return them as one DataFrame, one row per model.

    The original built each row with ~10 near-identical append lines; the
    repetition is factored into two private row builders.
    """
    METRIC_COLUMNS = ['Regression Type', 'r2', 'MAE', 'MSE', 'RMSE',
                      'r2_train', 'MAE_train', 'MSE_train', 'RMSE_train']

    def _linear_row(label, features):
        # One metrics row from a MyLinearRegression model on `features`.
        reg = MyLinearRegression(features, cab_type, ride_type)
        row = [label,
               reg.metric_r2(), reg.metric_MAE(), reg.metric_MSE(), reg.metric_RMSE(),
               reg.metric_r2_train(), reg.metric_MAE_train(),
               reg.metric_MSE_train(), reg.metric_RMSE_train()]
        return pd.DataFrame(np.asarray(row).reshape(1, 9), columns=METRIC_COLUMNS)

    def _poly_row(label, degree):
        # One metrics row from the numpy price~distance polynomial fit.
        features = ['price', 'distance']
        row = [label,
               reg_poly_r2(features, cab_type, ride_type, degree),
               reg_poly_MAE(features, cab_type, ride_type, degree),
               reg_poly_MSE(features, cab_type, ride_type, degree),
               reg_poly_RMSE(features, cab_type, ride_type, degree),
               reg_poly_r2_train(features, cab_type, ride_type, degree),
               reg_poly_MAE_train(features, cab_type, ride_type, degree),
               reg_poly_MSE_train(features, cab_type, ride_type, degree),
               reg_poly_RMSE_train(features, cab_type, ride_type, degree)]
        return pd.DataFrame(np.asarray(row).reshape(1, 9), columns=METRIC_COLUMNS)

    all_dfs = [
        _linear_row('Simple Linear Regression', ['price', 'distance']),
        _linear_row('Multiple Linear Regression',
                    ['price', 'distance', 'temperature', 'precipIntensity',
                     'humidity', 'windSpeed', 'time', 'weekday']),
        _poly_row('Quadratic Regression', 2),
        _poly_row('3-Degree Polynomial Regression', 3),
    ]
    return pd.concat(all_dfs).reset_index(drop=True)

performance_metrics(['Uber', 'Lyft'], ['Standard'])
performance_metrics(['Uber'], ['Standard'])
######################KNN REGRESSION##################
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import neighbors
class MyKnnRegression:
    """K-nearest-neighbors price regression over the module-level `df`.

    Filters `df` by cab/ride type, optionally one-hot encodes `weekday`,
    standardizes the features, and fits a KNeighborsRegressor predicting
    'price'. Exposes test-set and train-set r2/MAE/MSE/RMSE metrics.

    NOTE(review): relies on the global `df` having 'ride_type' and
    'cab_type' columns — verify the upstream rename that produced
    'ride_type' (the raw dataset column is 'name').
    """

    def __init__(self, features, cab_type, ride_type, K):
        # features: column names to keep; must include the target 'price'.
        # cab_type / ride_type: lists of values to filter on.
        # K: number of neighbors for the KNN regressor.
        self.features = features
        self.cab_type = cab_type
        self.ride_type = ride_type
        self.K = K

    def filter_data(self):
        """Return the filtered (and weekday-encoded) feature DataFrame."""
        df_filter = df.loc[(df['ride_type'].isin(self.ride_type))
                           & (df['cab_type'].isin(self.cab_type))]
        df_filter = df_filter.filter(items=self.features)
        # One-hot encode weekday if present; drop_first avoids the dummy trap.
        if any('weekday' in c for c in df_filter.columns):
            dummies = pd.get_dummies(df_filter['weekday'], prefix='Day',
                                     drop_first=True)
            df_filter = pd.concat([df_filter, dummies], axis=1)
            df_filter.drop(['weekday'], axis=1, inplace=True)
        return df_filter.reset_index(drop=True)

    def split_data(self):
        """Split into train/test and standardize the feature columns.

        BUG FIX: the scaler was previously fit on the FULL dataset before
        splitting (test-set leakage). It is now fit on the training
        partition only and applied to both partitions.
        """
        df_filter = self.filter_data()
        X = df_filter.loc[:, df_filter.columns != 'price']
        Y = df_filter['price']
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=0)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, X_test, y_train, y_test

    def create_model(self):
        """Fit and return a KNeighborsRegressor with K neighbors."""
        X_train, X_test, y_train, y_test = self.split_data()
        model_cab = neighbors.KNeighborsRegressor(n_neighbors=self.K)
        model_cab.fit(X_train, y_train)
        return model_cab

    def predict_model(self):
        """Return predictions on the held-out test set."""
        X_train, X_test, y_train, y_test = self.split_data()
        # create_model() already fits; no need to refit here.
        return self.create_model().predict(X_test)

    def metric_r2(self):
        """R^2 on the test set, rounded to 3 decimals."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(self.create_model().score(X_test, y_test), 3)

    def metric_MAE(self):
        """Mean absolute error on the test set."""
        X_train, X_test, y_train, y_test = self.split_data()
        mae = metrics.mean_absolute_error(y_test, self.predict_model())
        return round(mae, 3)

    def metric_MSE(self):
        """Mean squared error on the test set."""
        X_train, X_test, y_train, y_test = self.split_data()
        mse = metrics.mean_squared_error(y_test, self.predict_model())
        return round(mse, 3)

    def metric_RMSE(self):
        """Root mean squared error on the test set."""
        X_train, X_test, y_train, y_test = self.split_data()
        rmse = np.sqrt(metrics.mean_squared_error(y_test, self.predict_model()))
        return round(rmse, 3)

    def predict_model_train(self):
        """Return predictions on the training set (for overfit diagnosis)."""
        X_train, X_test, y_train, y_test = self.split_data()
        return self.create_model().predict(X_train)

    def metric_r2_train(self):
        """R^2 on the training set."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(self.create_model().score(X_train, y_train), 3)

    def metric_MAE_train(self):
        """Mean absolute error on the training set."""
        X_train, X_test, y_train, y_test = self.split_data()
        mae = metrics.mean_absolute_error(y_train, self.predict_model_train())
        return round(mae, 3)

    def metric_MSE_train(self):
        """Mean squared error on the training set."""
        X_train, X_test, y_train, y_test = self.split_data()
        mse = metrics.mean_squared_error(y_train, self.predict_model_train())
        return round(mse, 3)

    def metric_RMSE_train(self):
        """Root mean squared error on the training set."""
        X_train, X_test, y_train, y_test = self.split_data()
        rmse = np.sqrt(
            metrics.mean_squared_error(y_train, self.predict_model_train()))
        return round(rmse, 3)
# Quick smoke test: 3-NN on distance only, Uber Standard rides.
# NOTE(review): `test` is immediately rebound from the model instance to its
# training-set predictions, so the instance is lost after this line.
test = MyKnnRegression(['price','distance'], ['Uber'],['Standard'],3)
test = test.predict_model_train()
test
from sklearn.metrics import mean_squared_error
from math import sqrt
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot
def knn_regression_filter(variables, cab_type, ride_type):
    """Plot an elbow curve of test-set RMSE vs. K (odd K from 3 to 19).

    Builds a MyKnnRegression per K, collects its RMSE, and renders the
    curve with plotly.

    BUG FIX: the original returned `iplot(fig)`, but only `plot` is
    imported above (NameError unless iplot was imported in an unseen
    earlier cell — confirm); switched to the imported `plot`.
    """
    rmse_val = []  # test RMSE per K
    k_list = []
    for K in range(3, 21, 2):
        returned_knn = MyKnnRegression(variables, cab_type, ride_type, K)
        rmse_val.append(returned_knn.metric_RMSE())
        k_list.append(K)
    # Same two-column layout as before: K value, then its RMSE.
    curve = pd.DataFrame({'k_value': k_list, 'rmse': rmse_val})
    fig = go.Figure(data=go.Scatter(x=curve['k_value'], y=curve['rmse']))
    fig.update_layout(
        title='Accuracy per number of K Neighbors',
        xaxis=dict(
            tick0=3,
            dtick=2,
            title_text='# of Neighbors'
        ),
        yaxis=dict(
            title_text='Root Mean Squared Error')
    )
    return plot(fig)
# Elbow plot over all weather/time features for Standard rides, both platforms.
knn_regression_filter(['price','distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time', 'weekday'], ['Uber', 'Lyft'], ['Standard'])
def performance_metrics(cab_type, ride_type):
    """Return a DataFrame of test- and train-set metrics for two KNN models.

    Rows: a distance-only ("simple") model and an all-features ("multiple")
    model, both with K=7. Columns: label, r2/MAE/MSE/RMSE on test, then the
    same four on train.

    NOTE(review): this redefines (shadows) the earlier OLS
    `performance_metrics` — a notebook artifact; consider renaming one.
    Rows are now built directly (the old np.asarray(...).reshape path
    coerced every metric to a string).
    """
    columns = ['Regression Type', 'r2', 'MAE', 'MSE', 'RMSE',
               'r2_train', 'MAE_train', 'MSE_train', 'RMSE_train']

    def metric_row(label, model):
        # One results row: test metrics followed by training metrics.
        return [label,
                model.metric_r2(), model.metric_MAE(),
                model.metric_MSE(), model.metric_RMSE(),
                model.metric_r2_train(), model.metric_MAE_train(),
                model.metric_MSE_train(), model.metric_RMSE_train()]

    reg_simple = MyKnnRegression(['price', 'distance'], cab_type, ride_type, 7)
    reg_multi = MyKnnRegression(
        ['price', 'distance', 'temperature', 'precipIntensity', 'humidity',
         'windSpeed', 'time', 'weekday'], cab_type, ride_type, 7)
    rows = [metric_row('KNN Simple Linear Regression', reg_simple),
            metric_row('KNN Multiple Regression', reg_multi)]
    return pd.DataFrame(rows, columns=columns).reset_index(drop=True)
# KNN metrics table for Standard rides across both platforms.
performance_metrics(['Uber', 'Lyft'], ['Standard'])
def plot_learning_curves_knn(X, y, degrees):
    """Plot train vs. validation RMSE as the training-set size grows.

    Fits a KNN regressor (n_neighbors=degrees) on progressively larger
    prefixes of the training data and plots both RMSE curves.

    NOTE(review): X and y are accepted but NEVER used — the data comes
    from the hard-coded MyKnnRegression(...) below. Signature kept for
    the existing caller; TODO: either use X/y or drop the parameters.
    `degrees` does double duty as n_neighbors and as the slice offset
    (the first fit needs at least n_neighbors samples).
    """
    reg_simple_df = MyKnnRegression(['price', 'distance'],
                                    ['Uber'], ['Standard'], degrees)
    X_train, X_val, y_train, y_val = reg_simple_df.split_data()
    train_errors, val_errors = [], []
    for m in range(1, 500):
        model = neighbors.KNeighborsRegressor(n_neighbors=degrees)
        n = degrees + m  # prefix size; >= n_neighbors so fit succeeds
        model.fit(X_train[:n], y_train[:n])
        y_train_predict = model.predict(X_train[:n])
        y_val_predict = model.predict(X_val)
        train_errors.append(metrics.mean_squared_error(y_train[:n],
                                                       y_train_predict))
        val_errors.append(metrics.mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize=(15, 9))
    # RMSE = sqrt(MSE), plotted against number of added samples.
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.title('Learning Curve')
    plt.xlabel('Number of Samples')
    plt.ylabel('RMSE')
    plt.legend(loc='upper right')
    return plt.plot()
# Driver for the learning-curve plot.
# NOTE(review): X and y are passed but ignored by plot_learning_curves_knn,
# which builds its own data internally.
X = df['distance']
y = df['price']
# knn_reg = neighbors.KNeighborsRegressor(n_neighbors = degrees)
# plot_learning_curves_knn(X, y, 3)
plot_learning_curves_knn(X, y, 7)